import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as px
%matplotlib inline
import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
import statistics as stat
nd = pd.read_csv("/Users/lakshmiprasanna/Documents/netflix/netflix.csv",encoding='latin-1')
nd
| title | rating | ratingLevel | ratingDescription | release year | user rating score | user rating size | |
|---|---|---|---|---|---|---|---|
| 0 | White Chicks | PG-13 | crude and sexual humor, language and some drug... | 80 | 2004 | 82.0 | 80 |
| 1 | Lucky Number Slevin | R | strong violence, sexual content and adult lang... | 100 | 2006 | NaN | 82 |
| 2 | Grey's Anatomy | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 98.0 | 80 |
| 3 | Prison Break | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2008 | 98.0 | 80 |
| 4 | How I Met Your Mother | TV-PG | Parental guidance suggested. May not be suitab... | 70 | 2014 | 94.0 | 80 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | The BFG | PG | for action/peril, some scary moments and brief... | 60 | 2016 | 97.0 | 80 |
| 996 | The Secret Life of Pets | PG | for action and some rude humor | 60 | 2016 | NaN | 81 |
| 997 | Precious Puppies | TV-G | Suitable for all ages. | 35 | 2003 | NaN | 82 |
| 998 | Beary Tales | TV-G | Suitable for all ages. | 35 | 2013 | NaN | 82 |
| 999 | Growing Up Wild | G | NaN | 35 | 2016 | 80.0 | 80 |
1000 rows × 7 columns
nd['user rating score'].isnull().sum()
395
nd = nd.dropna(subset='user rating score')
nd = nd.reset_index(drop=True)
nd.columns
Index(['title', 'rating', 'ratingLevel', 'ratingDescription', 'release year',
'user rating score', 'user rating size'],
dtype='object')
nd['rating'].unique()
array(['PG-13', 'TV-14', 'TV-PG', 'TV-MA', 'NR', 'TV-Y', 'TV-Y7-FV', 'PG',
'R', 'TV-G', 'G', 'TV-Y7'], dtype=object)
nd['user rating score'].unique()
array([82., 98., 94., 95., 97., 91., 96., 77., 88., 80., 74., 81., 57.,
84., 83., 99., 89., 92., 62., 90., 79., 93., 61., 78., 66., 75.,
63., 68., 71., 59., 73., 86., 69., 58., 55., 56., 64., 85., 70.,
67., 72., 65.])
nd['user rating score'] = nd['user rating score'].astype('int64')
nd['user rating size'].unique()
array([80])
nd.head()
| title | rating | ratingLevel | ratingDescription | release year | user rating score | user rating size | |
|---|---|---|---|---|---|---|---|
| 0 | White Chicks | PG-13 | crude and sexual humor, language and some drug... | 80 | 2004 | 82 | 80 |
| 1 | Grey's Anatomy | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 98 | 80 |
| 2 | Prison Break | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2008 | 98 | 80 |
| 3 | How I Met Your Mother | TV-PG | Parental guidance suggested. May not be suitab... | 70 | 2014 | 94 | 80 |
| 4 | Supernatural | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 95 | 80 |
array_rateDesc = list(nd['ratingDescription'].unique())
array_rateDesc.sort(reverse=True)
array_rateDesc
[124, 110, 100, 90, 80, 70, 60, 42, 41, 35, 10]
array_ratings = []
for i in array_rateDesc:
array_ratings.append(nd.loc[nd['ratingDescription'] == i, 'rating'].values[0])
print(array_ratings)
['NR', 'TV-MA', 'R', 'TV-14', 'PG-13', 'TV-PG', 'PG', 'TV-Y7-FV', 'TV-Y7', 'TV-G', 'TV-Y']
dict = {r:rd for r,rd in zip(array_ratings,array_rateDesc)}
print(dict)
{'NR': 124, 'TV-MA': 110, 'R': 100, 'TV-14': 90, 'PG-13': 80, 'TV-PG': 70, 'PG': 60, 'TV-Y7-FV': 42, 'TV-Y7': 41, 'TV-G': 35, 'TV-Y': 10}
nd
| title | rating | ratingLevel | ratingDescription | release year | user rating score | user rating size | |
|---|---|---|---|---|---|---|---|
| 0 | White Chicks | PG-13 | crude and sexual humor, language and some drug... | 80 | 2004 | 82 | 80 |
| 1 | Grey's Anatomy | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 98 | 80 |
| 2 | Prison Break | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2008 | 98 | 80 |
| 3 | How I Met Your Mother | TV-PG | Parental guidance suggested. May not be suitab... | 70 | 2014 | 94 | 80 |
| 4 | Supernatural | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 95 | 80 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 600 | Pok̩mon: Indigo League | TV-Y | Suitable for all ages. | 10 | 2000 | 74 | 80 |
| 601 | Paddington | PG | mild action and rude humor | 60 | 2014 | 70 | 80 |
| 602 | Dawn of the Croods | TV-Y7 | Suitable for children ages 7 and older | 41 | 2017 | 72 | 80 |
| 603 | The BFG | PG | for action/peril, some scary moments and brief... | 60 | 2016 | 97 | 80 |
| 604 | Growing Up Wild | G | NaN | 35 | 2016 | 80 | 80 |
605 rows × 7 columns
rate_count = []
rate_sum = []
avg_rate = []
median_rate = []
most_repeated_rate = []
for i in array_ratings:
rate_count.append(nd[nd['rating']==i]['user rating score'].count())
rate_sum.append(nd[nd['rating']==i]['user rating score'].sum())
avg_rate.append(nd[nd['rating']==i]['user rating score'].mean())
median_rate.append(nd[nd['rating']==i]['user rating score'].median())
most_repeated_rate.append(nd[nd['rating']==i]['user rating score'].mode())
rate_count
[4, 96, 9, 188, 11, 42, 119, 16, 14, 24, 25]
diction = {'Rating genre': array_ratings,'Count' : rate_count, 'Overall ratings sum' : rate_sum,
'Average rating' : avg_rate, 'Median rating' : median_rate}
ratings_df = pd.DataFrame(diction)
ratings_df
| Rating genre | Count | Overall ratings sum | Average rating | Median rating | |
|---|---|---|---|---|---|
| 0 | NR | 4 | 348 | 87.000000 | 97.0 |
| 1 | TV-MA | 96 | 8498 | 88.520833 | 93.0 |
| 2 | R | 9 | 784 | 87.111111 | 91.0 |
| 3 | TV-14 | 188 | 16248 | 86.425532 | 91.0 |
| 4 | PG-13 | 11 | 814 | 74.000000 | 77.0 |
| 5 | TV-PG | 42 | 3606 | 85.857143 | 92.0 |
| 6 | PG | 119 | 10258 | 86.201681 | 91.0 |
| 7 | TV-Y7-FV | 16 | 1205 | 75.312500 | 73.0 |
| 8 | TV-Y7 | 14 | 1035 | 73.928571 | 72.0 |
| 9 | TV-G | 24 | 1788 | 74.500000 | 74.0 |
| 10 | TV-Y | 25 | 1890 | 75.600000 | 77.0 |
mode=[]
a = []
for i in range(len(most_repeated_rate)):
for j in range(len(most_repeated_rate[i])):
a.append(most_repeated_rate[i][j])
mode.append(stat.mean(a))
a=[]
mode
[97, 96, 88, 98, 82, 94, 97, 80, 72, 74, 81]
ratings_df['Mode'] = mode
ratings_df
| Rating genre | Count | Overall ratings sum | Average rating | Median rating | Mode | |
|---|---|---|---|---|---|---|
| 0 | NR | 4 | 348 | 87.000000 | 97.0 | 97 |
| 1 | TV-MA | 96 | 8498 | 88.520833 | 93.0 | 96 |
| 2 | R | 9 | 784 | 87.111111 | 91.0 | 88 |
| 3 | TV-14 | 188 | 16248 | 86.425532 | 91.0 | 98 |
| 4 | PG-13 | 11 | 814 | 74.000000 | 77.0 | 82 |
| 5 | TV-PG | 42 | 3606 | 85.857143 | 92.0 | 94 |
| 6 | PG | 119 | 10258 | 86.201681 | 91.0 | 97 |
| 7 | TV-Y7-FV | 16 | 1205 | 75.312500 | 73.0 | 80 |
| 8 | TV-Y7 | 14 | 1035 | 73.928571 | 72.0 | 72 |
| 9 | TV-G | 24 | 1788 | 74.500000 | 74.0 | 74 |
| 10 | TV-Y | 25 | 1890 | 75.600000 | 77.0 | 81 |
TV-14 : TV-14 stands for content that may be inappropriate for children younger than 14 years of age.
PG : PG stands for content that should be watched under parental guidance.
TV-MA: TV-MA is a rating assigned by the TV Parental Guidelines to a television program that was designed for mature audiences only.
(I'm considering median because it gives the best idea of what people think of the genre)
NR : NR rating refers to a movie that has not yet been rated. This could mean that the movie has not been submitted for a rating or is an uncut version of a movie that was submitted.
TV-MA : TV-MA is a rating assigned by the TV Parental Guidelines to a television program that was designed for mature audiences only.
TV-PG : TV-PG is a rating for the TV shows that are to be watched under parental guidance.
ratings_df['Weighted average'] = np.random.rand(len(ratings_df))
for i in range(len(ratings_df)):
ratings_df['Weighted average'][i] = (ratings_df['Count'][i]+ratings_df['Median rating'][i])/2
ratings_df
| Rating genre | Count | Overall ratings sum | Average rating | Median rating | Mode | Weighted average | |
|---|---|---|---|---|---|---|---|
| 0 | NR | 4 | 348 | 87.000000 | 97.0 | 97 | 50.5 |
| 1 | TV-MA | 96 | 8498 | 88.520833 | 93.0 | 96 | 94.5 |
| 2 | R | 9 | 784 | 87.111111 | 91.0 | 88 | 50.0 |
| 3 | TV-14 | 188 | 16248 | 86.425532 | 91.0 | 98 | 139.5 |
| 4 | PG-13 | 11 | 814 | 74.000000 | 77.0 | 82 | 44.0 |
| 5 | TV-PG | 42 | 3606 | 85.857143 | 92.0 | 94 | 67.0 |
| 6 | PG | 119 | 10258 | 86.201681 | 91.0 | 97 | 105.0 |
| 7 | TV-Y7-FV | 16 | 1205 | 75.312500 | 73.0 | 80 | 44.5 |
| 8 | TV-Y7 | 14 | 1035 | 73.928571 | 72.0 | 72 | 43.0 |
| 9 | TV-G | 24 | 1788 | 74.500000 | 74.0 | 74 | 49.0 |
| 10 | TV-Y | 25 | 1890 | 75.600000 | 77.0 | 81 | 51.0 |
go.Figure(data=go.Box(x=nd['rating'],y=nd['user rating score']))
traces = []
for column in ratings_df.columns[3:]:
trace = go.Scatter(x=ratings_df[column], y=ratings_df['Rating genre'], mode='lines', name=column)
traces.append(trace)
go.Figure(data=traces)
array_ratings
['NR', 'TV-MA', 'R', 'TV-14', 'PG-13', 'TV-PG', 'PG', 'TV-Y7-FV', 'TV-Y7', 'TV-G', 'TV-Y']
TV_shows = []
Movies_and_others = []
Type = ('TV shows', 'Movies and others')
for i in range(len(array_ratings)):
if (array_ratings[i][0] == 'T' and array_ratings[i][1] == 'V'):
TV_shows.append(array_ratings[i])
else:
Movies_and_others.append(array_ratings[i])
TV_shows
['TV-MA', 'TV-14', 'TV-PG', 'TV-Y7-FV', 'TV-Y7', 'TV-G', 'TV-Y']
Movies_and_others
['NR', 'R', 'PG-13', 'PG']
nd['type label'] = np.random.rand(len(nd))
for i in range(len(nd)):
if nd['rating'][i] in TV_shows:
nd['type label'][i] = "TV shows"
else:
nd['type label'][i] = "Movies and others"
nd
| title | rating | ratingLevel | ratingDescription | release year | user rating score | user rating size | type label | |
|---|---|---|---|---|---|---|---|---|
| 0 | White Chicks | PG-13 | crude and sexual humor, language and some drug... | 80 | 2004 | 82 | 80 | Movies and others |
| 1 | Grey's Anatomy | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 98 | 80 | TV shows |
| 2 | Prison Break | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2008 | 98 | 80 | TV shows |
| 3 | How I Met Your Mother | TV-PG | Parental guidance suggested. May not be suitab... | 70 | 2014 | 94 | 80 | TV shows |
| 4 | Supernatural | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 95 | 80 | TV shows |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 600 | Pok̩mon: Indigo League | TV-Y | Suitable for all ages. | 10 | 2000 | 74 | 80 | TV shows |
| 601 | Paddington | PG | mild action and rude humor | 60 | 2014 | 70 | 80 | Movies and others |
| 602 | Dawn of the Croods | TV-Y7 | Suitable for children ages 7 and older | 41 | 2017 | 72 | 80 | TV shows |
| 603 | The BFG | PG | for action/peril, some scary moments and brief... | 60 | 2016 | 97 | 80 | Movies and others |
| 604 | Growing Up Wild | G | NaN | 35 | 2016 | 80 | 80 | Movies and others |
605 rows × 8 columns
sns.set(style="darkgrid")
sns.countplot(x="type label", data=nd, palette="Set2")
<AxesSubplot:xlabel='type label', ylabel='count'>
labels = ['TV shows', 'Movies and others']
count = []
count.append(nd[nd['type label']=='TV shows']['type label'].count())
count.append(nd[nd['type label']=='Movies and others']['type label'].count())
lay = go.Layout(title='count')
go.Figure(data=go.Bar(x=labels,y=count),layout=lay)
top_5_max_values = nd['user rating score'].nlargest(5)
print(top_5_max_values)
27 99 95 99 119 99 145 99 192 99 Name: user rating score, dtype: int64
nd[nd['user rating score']==99]
| title | rating | ratingLevel | ratingDescription | release year | user rating score | user rating size | type label | |
|---|---|---|---|---|---|---|---|---|
| 27 | 13 Reasons Why | TV-MA | For mature audiences. May not be suitable for... | 110 | 2017 | 99 | 80 | TV shows |
| 95 | 13 Reasons Why | TV-MA | For mature audiences. May not be suitable for... | 110 | 2017 | 99 | 80 | TV shows |
| 119 | 13 Reasons Why | TV-MA | For mature audiences. May not be suitable for... | 110 | 2017 | 99 | 80 | TV shows |
| 145 | 13 Reasons Why | TV-MA | For mature audiences. May not be suitable for... | 110 | 2017 | 99 | 80 | TV shows |
| 192 | 13 Reasons Why | TV-MA | For mature audiences. May not be suitable for... | 110 | 2017 | 99 | 80 | TV shows |
| 241 | 13 Reasons Why | TV-MA | For mature audiences. May not be suitable for... | 110 | 2017 | 99 | 80 | TV shows |
| 280 | 13 Reasons Why | TV-MA | For mature audiences. May not be suitable for... | 110 | 2017 | 99 | 80 | TV shows |
| 364 | 13 Reasons Why | TV-MA | For mature audiences. May not be suitable for... | 110 | 2017 | 99 | 80 | TV shows |
top_5_movies = nd[nd['type label']=='Movies and others']['user rating score'].nlargest(5)
print(top_5_movies)
64 98 190 98 237 98 433 98 470 98 Name: user rating score, dtype: int64
nd[(nd['type label']=='Movies and others') & (nd['user rating score']==98)]
| title | rating | ratingLevel | ratingDescription | release year | user rating score | user rating size | type label | |
|---|---|---|---|---|---|---|---|---|
| 64 | Finding Dory | PG | mild thematic elements | 60 | 2016 | 98 | 80 | Movies and others |
| 190 | Finding Dory | PG | mild thematic elements | 60 | 2016 | 98 | 80 | Movies and others |
| 237 | Finding Dory | PG | mild thematic elements | 60 | 2016 | 98 | 80 | Movies and others |
| 433 | Finding Dory | PG | mild thematic elements | 60 | 2016 | 98 | 80 | Movies and others |
| 470 | Finding Dory | PG | mild thematic elements | 60 | 2016 | 98 | 80 | Movies and others |
#dict
sns.countplot(nd['rating'])
/Users/lakshmiprasanna/opt/anaconda3/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
<AxesSubplot:xlabel='rating', ylabel='count'>
sns.barplot(data=nd,x='user rating score',y='rating')
<AxesSubplot:xlabel='user rating score', ylabel='rating'>
go.Figure(data=go.Bar(x=ratings_df['Rating genre'],y=ratings_df['Count']))
nd.head()
| title | rating | ratingLevel | ratingDescription | release year | user rating score | user rating size | type label | |
|---|---|---|---|---|---|---|---|---|
| 0 | White Chicks | PG-13 | crude and sexual humor, language and some drug... | 80 | 2004 | 82 | 80 | Movies and others |
| 1 | Grey's Anatomy | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 98 | 80 | TV shows |
| 2 | Prison Break | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2008 | 98 | 80 | TV shows |
| 3 | How I Met Your Mother | TV-PG | Parental guidance suggested. May not be suitab... | 70 | 2014 | 94 | 80 | TV shows |
| 4 | Supernatural | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 95 | 80 | TV shows |
year = list(nd['release year'].unique())
year.sort()
count_year = []
for i in year:
count_year.append(nd[nd['release year']==i]['title'].count())
go.Figure(data=go.Bar(x=year,y=count_year))
grouped = nd.groupby('type label')
traces = []
for category, group in grouped:
trace = go.Histogram(x=group['release year'], name=category)
traces.append(trace)
fig = go.Figure(data=traces)
fig.show()
grouped = nd.groupby('rating')
traces = []
for category, group in grouped:
trace = go.Histogram(x=group['release year'], name=category)
traces.append(trace)
fig = go.Figure(data=traces)
fig.show()
for i in array_ratings:
selected_values = nd['release year'].loc[nd['rating']==i]
histogram = go.Histogram(x=selected_values)
layout = go.Layout(title=f'Histogram for {i}')
fig = go.Figure(data=[histogram], layout=layout)
fig.show()
nd.head()
| title | rating | ratingLevel | ratingDescription | release year | user rating score | user rating size | type label | |
|---|---|---|---|---|---|---|---|---|
| 0 | White Chicks | PG-13 | crude and sexual humor, language and some drug... | 80 | 2004 | 82 | 80 | Movies and others |
| 1 | Grey's Anatomy | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 98 | 80 | TV shows |
| 2 | Prison Break | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2008 | 98 | 80 | TV shows |
| 3 | How I Met Your Mother | TV-PG | Parental guidance suggested. May not be suitab... | 70 | 2014 | 94 | 80 | TV shows |
| 4 | Supernatural | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 95 | 80 | TV shows |
nd_tv14 = nd[nd['rating']=='TV-14'].reset_index(drop=True)
nd_tv14
| title | rating | ratingLevel | ratingDescription | release year | user rating score | user rating size | type label | |
|---|---|---|---|---|---|---|---|---|
| 0 | Grey's Anatomy | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 98 | 80 | TV shows |
| 1 | Prison Break | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2008 | 98 | 80 | TV shows |
| 2 | Supernatural | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 95 | 80 | TV shows |
| 3 | The Vampire Diaries | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2017 | 91 | 80 | TV shows |
| 4 | Pretty Little Liars | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 96 | 80 | TV shows |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 183 | Madam Secretary | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 70 | 80 | TV shows |
| 184 | Rosewood | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2015 | 81 | 80 | TV shows |
| 185 | Quantico | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 88 | 80 | TV shows |
| 186 | Limitless | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 84 | 80 | TV shows |
| 187 | Greenleaf | TV-14 | Parents strongly cautioned. May be unsuitable ... | 90 | 2016 | 62 | 80 | TV shows |
188 rows × 8 columns
count_16 = nd_tv14[nd_tv14['release year']==2016]['user rating score'].count()
median_16 = nd_tv14[nd_tv14['release year']==2016]['user rating score'].median()
count_17 = nd_tv14[nd_tv14['release year']==2017]['user rating score'].count()
median_17 = nd_tv14[nd_tv14['release year']==2017]['user rating score'].median()
count_15 = nd_tv14[nd_tv14['release year']==2015]['user rating score'].count()
median_15 = nd_tv14[nd_tv14['release year']==2015]['user rating score'].median()
count = []
count.append([count_15,count_16,count_17])
count = [obj for sublist in count for obj in sublist]
median = []
median.append([median_15,median_16,median_17])
median = [obj for sublist in median for obj in sublist]
temp_dict = {'year': [2015,2016,2017],'count': count, 'median': median}
temp = pd.DataFrame(temp_dict)
temp = temp.set_index('year')
print(temp)
count median year 2015 23 83.0 2016 88 91.0 2017 10 91.0
for i in array_ratings:
selected_values = nd['user rating score'].loc[nd['rating']==i]
histogram = go.Histogram(x=selected_values)
layout = go.Layout(title=f'Histogram for {i}')
fig = go.Figure(data=[histogram], layout=layout)
fig.show()